source("../../../methods/gcqn_validated.R")
gcqn_qsmooth <- function(counts, gcGroups, bio){
  gcBinNormCounts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts), dimnames=list(rownames(counts),colnames(counts)))
  for(ii in 1:nlevels(gcGroups)){
    id <- which(gcGroups==levels(gcGroups)[ii])
    countBin <- counts[id,]
    qs <- qsmooth(countBin, group_factor=bio)
    normCountBin <- qs@qsmoothData
    normCountBin <- round(normCountBin)
    normCountBin[normCountBin<0] <- 0
    gcBinNormCounts[id,] <- normCountBin
  }
  return(gcBinNormCounts)
}

library(grDevices)
palette(c("black", "#DF536B", "#61D04F", "#2297E6", "#28E2E5", "#D03AF5", "#EEC21F", "gray62"))
library(GenomicAlignments)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: Rsamtools
library(rafalib)
library(cqn)
## Loading required package: mclust
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
## Loading required package: nor1mix
## Loading required package: preprocessCore
## Loading required package: splines
## Loading required package: quantreg
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(Biobase)
library(edgeR)
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(mclust)
library(umap)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.2     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   1.0.3
## ✔ tidyr   1.0.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks Biostrings::collapse(), IRanges::collapse()
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ purrr::compact()    masks XVector::compact()
## ✖ dplyr::count()      masks matrixStats::count()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks GenomicAlignments::first(), S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::last()       masks GenomicAlignments::last()
## ✖ purrr::map()        masks mclust::map()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ purrr::simplify()   masks DelayedArray::simplify()
## ✖ dplyr::slice()      masks XVector::slice(), IRanges::slice()
library(viridis)
## Loading required package: viridisLite
library(qsmooth)
library(RUVSeq)
## Loading required package: EDASeq
## Loading required package: ShortRead
## 
## Attaching package: 'ShortRead'
## The following object is masked from 'package:dplyr':
## 
##     id
## The following object is masked from 'package:purrr':
## 
##     compose
## The following object is masked from 'package:tibble':
## 
##     view
plotGCHex <- function(gr, counts){
  counts2 <- counts
  df <- as_tibble(cbind(counts2,gc=mcols(gr)$gc))
  df <- gather(df, sample, value, -gc)
  ggplot(data=df, aes(x=gc, y=log(value+1)) ) + ylab("log(count + 1)") + xlab("GC content") + geom_hex(bins = 50) + theme_bw() + facet_wrap(~sample, nrow=2)
}
plotRD <- function(rd, celltype, region, col=NULL, ...){
  if(is.null(col)) col <- 1:nlevels(region)
  plot(rd, pch=as.numeric(celltype)+15, col=col[region], 
       xlab="Dimension 1", ylab="Dimension 2", ...)
  legend("bottomleft", c("neuronal", "non-neuronal"), pch=c(16,17), bty='n')
  legend("topleft", levels(region), pch=16, col=col[1:nlevels(region)], bty='n')
}

# counts <- as.matrix(read.table("../../../data/fullard2019/boca_raw_count_matrix.tsv", header=TRUE, stringsAsFactors = FALSE))
# peaks <- read.table("../../../data/fullard2019/boca_peaks_consensus_no_blacklisted_regions.bed", header=FALSE, stringsAsFactors = FALSE)
# colnames(peaks) <- c("chromosome", "start", "end", "name")
countsURL <- "https://bendlj01.u.hpc.mssm.edu/multireg/resources/boca_raw_count_matrix.tsv.gz"
download.file(countsURL, destfile="~/tmp/countsFullard.tsv.gz")
counts <- as.matrix(read.table("~/tmp/countsFullard.tsv.gz", header=TRUE, stringsAsFactors = FALSE))

peaksURL <- "https://bendlj01.u.hpc.mssm.edu/multireg/resources/boca_peaks_consensus_no_blacklisted_regions.bed"
download.file(peaksURL, destfile="~/tmp/peaksFullard.bed")
peaks <- read.table("~/tmp/peaksFullard.bed", header=FALSE, stringsAsFactors = FALSE)
colnames(peaks) <- c("chromosome", "start", "end", "name")


peakNames <- peaks$name
sn <- substr(peaks$chromosome, 4, sapply(peaks$chromosome, nchar))
start <- peaks$start
end <- peaks$end
gr <- GRanges(seqnames=sn, ranges = IRanges(start=start, end=end), strand="*")
names(gr) <- peaks$name

# get GC content
ff <- FaFile("~/data/genomes/human/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa.gz")
peakSeqs <- getSeq(x=ff, gr)
gcContentPeaks <- letterFrequency(peakSeqs, "GC",as.prob=TRUE)[,1]
gcGroups <- Hmisc::cut2(gcContentPeaks, g=20)
mcols(gr)$gc <- gcContentPeaks

# design
# the data should consist of 2 cell types (neurons and non-neurons) across 14 distinct brain regions of 5 individuals
colnames(counts) <- gsub(x=colnames(counts),pattern="^X", replacement="")
individual <- factor(unlist(lapply(strsplit(colnames(counts), split="_"), "[[", 1)))
names(individual) <- colnames(counts)
celltype <- factor(unlist(lapply(strsplit(colnames(counts), split="_"), "[[", 2)))
names(celltype) <- colnames(counts)
region <- factor(unlist(lapply(strsplit(colnames(counts), split="_"), "[[", 3)))
names(region) <- colnames(counts)
## they also define groups of regions: https://bendlj01.u.hpc.mssm.edu/multireg/
regionBig <- as.character(region)
regionBig[region %in% c("DLPFC", "OFC", "VLPFC", "ACC", "STC", "ITC", "PMC", "INS")] <- "NCX"
regionBig[region %in% c("NAC", "PUT")] <- "STR"
regionBig <- factor(regionBig)
rawCounts <- counts

Get normalized count matrices

DESeq2

library(DESeq2)
dds <- DESeqDataSetFromMatrix(counts,
                              colData=data.frame(individual, celltype, region),
                              design=as.formula("~ individual + celltype*region"))
# calculate DESeq2 size factors
dds <- estimateSizeFactors(dds)
# extract the size factors and save them in an object
sf <- sizeFactors(dds)
# divide each row by the vector of size factors
normCountsDESeq2 <- sweep(counts, 2, sizeFactors(dds), "/")

edgeR

library(edgeR)
d <- DGEList(counts)
d <- calcNormFactors(d)
nf <- d$samples$norm.factors
effLibSize <- colSums(counts) * nf
effLibSize_scaled <- effLibSize / mean(effLibSize)
# divide each row by the vector of size factors
normCountsEdgeR <- sweep(counts, 2, effLibSize_scaled, "/")

Full quantile normalization

normCountsFQ <- FQnorm(counts, type="median")

cqn

# run cqn model
cqnObj <- cqn(counts, x=gcContentPeaks, lengths=width(gr))
## Warning: The use of 'sig2' is deprecated; do specify 'sigma' (= sqrt(sig2))
## instead
# calculate normalized counts
normCountsCqn <- 2^(cqnObj$y + cqnObj$offset)

cqnplot(cqnObj, n=1)

cqnplot(cqnObj, n=2)

EDASeq

library(EDASeq)
wit <- withinLaneNormalization(as.matrix(counts), gcContentPeaks, which="full")
normCountsEDASeq <- betweenLaneNormalization(wit, which="full")
rm(wit)

GC-QN

gcGroups <- Hmisc::cut2(gcContentPeaks, g=50)
normCountsGcqn <- gcqn(counts, gcGroups, "median")

Smooth GC-QN

normCountsGcqnSmooth <- gcqn_qsmooth(counts, gcGroups, bio=droplevels(interaction(celltype, region)))

PCA

set.seed(44)

pcaPlot <- function(pca, regionBig, celltype){
  require(ggplot2)
  pctVar <- pca$sdev^2 / sum(pca$sdev^2)
  df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2])
  df %>% ggplot(., aes(x=PC1, y=PC2, colour=regionBig, shape=celltype)) +
    geom_point(size=3) +
    theme_classic() + 
    xlab(paste0("PC 1 (",round(pctVar[1],3)*100,"%)")) +
    ylab(paste0("PC 2 (",round(pctVar[2],3)*100,"%)"))
}



normMethods <- c("Raw", "DESeq2", "EdgeR", "FQ", "EDASeq", "Cqn", "Gcqn", "GcqnSmooth")
titles <- c("None", "DESeq2", "edgeR", "Full quantile", "FQ-FQ", "cqn", "GC-FQ", "smooth GC-FQ")
pcaPlots <- list()
for(mm in 1:length(normMethods)){
    curMethod <- normMethods[mm]
    if(curMethod == "Raw"){
      curCounts <- as.matrix(counts)
    } else {
      curCounts <- as.matrix(get(paste0("normCounts",curMethod)))
    }
    pca <- prcomp(t(log1p(curCounts)))
    pctVar <- pca$sdev^2 / sum(pca$sdev^2)
    pcaPlots[[mm]] <- pcaPlot(pca, regionBig, celltype) + ggtitle(titles[mm])
}
names(pcaPlots) <- normMethods
cowplot::plot_grid(plotlist=pcaPlots)

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_pcaPlots.pdf", width=12, height=12)

Hierarchical tree

library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.13.2
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:Biostrings':
## 
##     nnodes
## The following object is masked from 'package:stats':
## 
##     cutree
ddRaw <- dist(log1p(t(counts)), method="euclidean")
ddEdgeR <- dist(log1p(t(normCountsEdgeR)), method="euclidean")
ddDESeq2 <- dist(log1p(t(normCountsDESeq2)), method="euclidean")
ddFQ <- dist(log1p(t(normCountsFQ)), method="euclidean")
ddCqn <- dist(log1p(t(normCountsCqn)), method="euclidean")
ddEDASeq <- dist(log1p(t(normCountsEDASeq)), method="euclidean")
ddGCQN <- dist(log1p(t(normCountsGcqn)), method="euclidean")
ddGCQNSmooth <- dist(log1p(t(normCountsGcqnSmooth)), method="euclidean")


plotDD <- function(dist, col, label, main, cex=1/2, ...){
  require(dendextend)
  hdd <- hclust(dist)
  dhdd <- as.dendrogram(hdd)
  origLabels <- labels(dhdd)
  labels(dhdd) <- label[origLabels]
  labels_colors(dhdd) <- col[origLabels]
  dhdd <- set(dhdd, "labels_cex", cex)
  plot(dhdd, main=main, ...)
}


## cell type effect
colCelltype <- as.numeric(celltype)
names(colCelltype) <- names(celltype)
# pdf("~/Dropbox/research/atacseq/bulk/plots/hclust_celltype_fullard2018.pdf",
#     width=7, height=5)
plotDD(ddRaw, col=colCelltype, label=celltype, main="Raw counts: celltype effect")

plotDD(ddEdgeR, col=colCelltype, label=celltype, main="edgeR norm: celltype effect")

plotDD(ddDESeq2, col=colCelltype, label=celltype, main="DESeq2 norm: celltype effect")

plotDD(ddFQ, col=colCelltype, label=celltype, main="FQ norm: celltype effect")

plotDD(ddCqn, col=colCelltype, label=celltype, main="Cqn norm: celltype effect")

plotDD(ddEDASeq, col=colCelltype, label=celltype, main="FQ-FQ norm: celltype effect")

plotDD(ddGCQN, col=colCelltype, label=celltype, main="GC-FQ norm: celltype effect")

plotDD(ddGCQNSmooth, col=colCelltype, label=celltype, main="smooth GC-FQ norm: celltype effect")

# dev.off()

## individual effect
colIndividual <- as.numeric(individual)
names(colIndividual) <- names(individual)
plotDD(ddRaw, col=colIndividual, label=individual, main="Raw counts: individual effect")

plotDD(ddEdgeR, col=colIndividual, label=individual, main="edgeR norm: individual effect")

plotDD(ddDESeq2, col=colIndividual, label=individual, main="DESeq2 norm: individual effect")

plotDD(ddFQ, col=colIndividual, label=individual, main="FQ norm: individual effect")

plotDD(ddCqn, col=colIndividual, label=individual, main="Cqn norm: individual effect")

plotDD(ddEDASeq, col=colIndividual, label=individual, main="EDASeq norm: individual effect")

plotDD(ddGCQN, col=colIndividual, label=individual, main="GC-QN norm: individual effect")

plotDD(ddGCQNSmooth, col=colIndividual, label=individual, main="smooth GC-QN norm: individual effect")

## region effect
colRegion <- as.numeric(region)
names(colRegion) <- names(region)
plotDD(ddRaw, col=colRegion, label=region, main="Raw counts: region effect")

plotDD(ddEdgeR, col=colRegion, label=region, main="edgeR norm: region effect")

plotDD(ddDESeq2, col=colRegion, label=region, main="DESeq2 norm: region effect")

plotDD(ddFQ, col=colRegion, label=region, main="FQ norm: region effect")

plotDD(ddCqn, col=colRegion, label=region, main="Cqn norm: region effect")

plotDD(ddEDASeq, col=colRegion, label=region, main="EDASeq norm: region effect")

plotDD(ddGCQN, col=colRegion, label=region, main="GC-QN norm: region effect")

plotDD(ddGCQNSmooth, col=colRegion, label=region, main="smooth GC-QN norm: region effect")

clustering based on PCA

library(irlba)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## The following object is masked from 'package:S4Vectors':
## 
##     expand
library(uwot)
## 
## Attaching package: 'uwot'
## The following object is masked from 'package:umap':
## 
##     umap
library(cluster)
library(mclust)
library(ggplot2)
umapDR <- function(counts, nPC=6){
  pc <- irlba::irlba(log1p(counts), nv=nPC)
  umDR <- uwot::umap(pc$v)
  return(umDR)
}

pcs <- c(2, 5, 8, 10)
ariMatCT <- matrix(NA, nrow=8, ncol=length(pcs))
rownames(ariMatCT) <- c("Raw", "edgeR", "DESeq2", "FQ", "Cqn", "EDASeq", "GC-QN", "smooth GC-QN")
ariMatRegion <- matrix(NA, nrow=8, ncol=length(pcs))
rownames(ariMatRegion) <- c("Raw", "edgeR", "DESeq2", "FQ", "Cqn", "EDASeq", "GC-QN", "smooth GC-QN")
for(pp in 1:length(pcs)){
  set.seed(pp)
  nPC <- pcs[pp]
  
  umRaw <- prcomp(t(log1p(counts)))$x[,1:nPC]
  umEdgeR <- prcomp(t(log1p(normCountsEdgeR)))$x[,1:nPC]
  umDESeq2 <- prcomp(t(log1p(normCountsDESeq2)))$x[,1:nPC]
  umFQ <- prcomp(t(log1p(normCountsFQ)))$x[,1:nPC]
  umCqn <- prcomp(t(log1p(normCountsCqn)))$x[,1:nPC]
  umEDASeq <- prcomp(t(log1p(normCountsEDASeq)))$x[,1:nPC]
  umGcqn <- prcomp(t(log1p(normCountsGcqn)))$x[,1:nPC]
  umGcqnSmooth <- prcomp(t(log1p(normCountsGcqnSmooth)))$x[,1:nPC]

  
  plotRD(umRaw, celltype, regionBig, main="Raw counts")
  plotRD(umEdgeR, celltype, regionBig, main="edgeR counts")
  plotRD(umDESeq2, celltype, regionBig, main="DESeq2 counts")
  plotRD(umFQ, celltype, regionBig, main="FQ counts")
  plotRD(umCqn, celltype, regionBig, main="Cqn counts")
  plotRD(umEDASeq, celltype, regionBig, main="EDASeq counts")
  plotRD(umGcqn, celltype, regionBig, main="GCQN counts")
  plotRD(umGcqnSmooth, celltype, regionBig, main="smooth GCQN counts")

  ariMatCT[1,pp] <- mclust::adjustedRandIndex(pam(umRaw, k=2)$clustering, celltype)
  ariMatCT[2,pp] <- mclust::adjustedRandIndex(pam(umEdgeR, k=2)$clustering, celltype)
  ariMatCT[3,pp] <- mclust::adjustedRandIndex(pam(umDESeq2, k=2)$clustering, celltype)
  ariMatCT[4,pp] <- mclust::adjustedRandIndex(pam(umFQ, k=2)$clustering, celltype)
  ariMatCT[5,pp] <- mclust::adjustedRandIndex(pam(umCqn, k=2)$clustering, celltype)
  ariMatCT[6,pp] <- mclust::adjustedRandIndex(pam(umEDASeq, k=2)$clustering, celltype)
  ariMatCT[7,pp] <- mclust::adjustedRandIndex(pam(umGcqn, k=2)$clustering, celltype)
  ariMatCT[8,pp] <- mclust::adjustedRandIndex(pam(umGcqnSmooth, k=2)$clustering, celltype)

  ariMatRegion[1,pp] <- mclust::adjustedRandIndex(pam(umRaw, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[2,pp] <- mclust::adjustedRandIndex(pam(umEdgeR, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[3,pp] <- mclust::adjustedRandIndex(pam(umDESeq2, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[4,pp] <- mclust::adjustedRandIndex(pam(umFQ, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[5,pp] <- mclust::adjustedRandIndex(pam(umCqn, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[6,pp] <- mclust::adjustedRandIndex(pam(umEDASeq, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[7,pp] <- mclust::adjustedRandIndex(pam(umGcqn, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[8,pp] <- mclust::adjustedRandIndex(pam(umGcqnSmooth, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))

}

ariPlot <- function(ariMat){
  ariMat <- as.data.frame(ariMat)
  colnames(ariMat) <- paste0(pcs,"PC")
  ariMatLong <- tidyr::gather(ariMat)
    ariMatLong$method <- factor(rep(c("Raw", "edgeR", "DESeq2", "FQ", "Cqn", "FQ-FQ", "GC-FQ", "smooth GC-FQ"), length(pcs)))
  ariMatLong$gc <- ifelse(ariMatLong$method %in% c("Cqn", "FQ-FQ", "GC-FQ", "smooth GC-FQ"), TRUE, FALSE)
  ariMatLong$key <- factor(ariMatLong$key, levels=c("2PC", "5PC", "8PC", "10PC"))
  
  ggplot(ariMatLong, aes(x=key, y=value, color=method, shape=gc)) +
    geom_jitter(width=.1, height=0, size=3) +
    theme_classic() +
    xlab("Number of PCs") +
    ylab("Adjusted Rand Index") +
    scale_shape_discrete(name="GC-aware")
}

# for clustering according to cell type, the decrease in ARI is lower for GC-aware methods
# possibly because they don't let technical GC effects contaminate the PCs.
ariPlot(ariMatCT)

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard2018_ariCelltype.pdf", width = 8)
ariRegion <- ariPlot(ariMatRegion)
ariRegion

barplot(rowMeans(ariMatRegion), las=2)

DA analysis, neuronal vs non-neuronal

Ignoring the individual effects and making the notation simpler by only working with 4 regions, the contrast is derived as follows.

\[ \eta_{ij} = \beta_0 + \beta_1 x_{NN} + \beta_2 x_{r2} + \beta_3 x_{r3} + \beta_4 x_{r4} + \beta_5 x_{r2:NN} + \beta_6 x_{r3:NN} + \beta_7 x_{r4:NN} \]

Where \(NN\) represents non-neuronal cell type, and \(r_i\) represents region \(i\).

The average of the neuronal cell type is

\[ \frac{1}{4} \left\{ \beta_0 + (\beta_0 + \beta_2) + (\beta_0 + \beta_3) + (\beta_0 + \beta_4) \right\}. \]

The average of the non-neuronal cell type is

\[ \frac{1}{4} \left\{ (\beta_0 + \beta_1) + (\beta_0 + \beta_1 + \beta_2 + \beta_5) + (\beta_0 + \beta_1 + \beta_3 + \beta_6) + (\beta_0 + \beta_1 + \beta_4 + \beta_7) \right\}. \]

Their difference, non-neuronal minus neuronal, is

\[ \beta_0 + \beta_1 + \frac{1}{4}\left( \beta_2 + \beta_5 + \beta_3 + \beta_6 + \beta_4 + \beta_7 \right) - \left\{ \beta_0 + \frac{1}{4}\left( \beta_2 + \beta_3 + \beta_4 \right) \right\} \]

\[ = \beta_1 + \frac{1}{4} (\beta_5 + \beta_6 + \beta_7). \]

This is the contrast we need. A similar derivation holds for 14 regions.

MA-plots and intersections

testEdgeR <- function(counts, design, L, tmm=FALSE, offset=NULL){
  d <- DGEList(counts)
  if(tmm) d <- calcNormFactors(d)
  if(!is.null(offset)) d$offset <- offset
  d <- estimateDisp(d, design)
  fit <- glmFit(d, design)
  lrt <- glmLRT(fit, contrast=L)
  lrt$table$padj <- p.adjust(lrt$table$PValue, "fdr")
  return(lrt)
}

covarDf <- data.frame(region, celltype, individual)
testDESeq2 <- function(counts, covarDf, L){
  dds <- DESeqDataSetFromMatrix(counts,
                              colData=covarDf,
                              design=as.formula("~ region*celltype + individual"))
  dds <- estimateSizeFactors(dds)
  dds <- DESeq(dds)
  res <- results(dds, contrast = L)
  return(res)
}

design <- model.matrix(~region*celltype + individual)
L_neur <- matrix(0, nrow=ncol(design), ncol=1)
rownames(L_neur) <- colnames(design)
L_neur["celltypeN",1] <- 1 #main neuronal effect
L_neur[grep(x=rownames(L_neur), pattern=":celltypeN", fixed=TRUE),1] <- 1/14
normMethods <- c("Raw", "DESeq2", "EdgeR", "FQ", "EDASeq", "Cqn", "Gcqn", "GcqnSmooth")
lrtOut <- list()
for(mm in 1:length(normMethods)){
    curMethod <- normMethods[mm]
    if(curMethod == "Raw"){
      curCounts <- as.matrix(rawCounts)
      lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur)
      next
    } else if (curMethod == "EdgeR"){
      curCounts <- as.matrix(rawCounts)
      lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur, tmm=TRUE)
      next
    } else if (curMethod == "DESeq2"){
      curCounts <- as.matrix(rawCounts)
      lrtOut[[mm]] <- testDESeq2(curCounts, covarDf, L_neur)
      next
    } else if (curMethod == "Cqn"){
      curCounts <- as.matrix(rawCounts)
      lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur, offset=cqnObj$glm.offset)
      next
    } else if(curMethod == "Cqn_noLength"){
      curCounts <- as.matrix(rawCounts)
      lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur, offset=cqnObj_noLength$glm.offset)
      next
    } else if(curMethod == "RUV"){
      curCounts <- as.matrix(rawCounts)
      lrtOut[[mm]] <- testEdgeR(curCounts, designRUV, L_neurRUV)
    } else if(curMethod %in% c("FQ", "EDASeq", "Gcqn", "GcqnSmooth")){
      curCounts <- as.matrix(get(paste0("normCounts",curMethod)))
      lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur)
    }
}
# saveRDS(lrtOut, file="lrtOut3.rds")
lrtOut <- readRDS("lrtOut3.rds")
maPlotGC <- function(M, A, gc){
  require(ggplot2)
  df <- data.frame(M, A, gc)
  df %>% ggplot(., aes(x=M, y=A, colour=gc)) +
    geom_point(size=1/2, alpha=1/4) +
    scale_color_gradientn(colours=wesanderson::wes_palette("Zissou1", n=10, type="continuous"), name="GC content", breaks=quantile(gc, probs=seq(0,1,length=10)), labels=NULL) +
    theme_classic() + 
    theme(legend.position = "none") +
    geom_hline(aes(yintercept=0)) +
    geom_smooth(data=df, mapping=aes(x=M, y=A), se=FALSE) +
    xlab("Average log counts per million") +
    ylab("Log2 fold change")
}

maPlotGC_hex <- function(M, A, gc, ng=20, bins=18){
  trunc.median <- function(x, gcContentPeaks){
    xBar <- median(x)
    if(xBar > quantile(gcContentPeaks, .98)){
      xBar <- quantile(gcContentPeaks, .98)
    } else if(xBar < quantile(gcContentPeaks, .02)){
      xBar <- quantile(gcContentPeaks, .02)
    }
    return(xBar)
  }
  require(ggplot2)
  df <- data.frame(M, A, gc)
  vals <- quantile(gcContentPeaks/max(gcContentPeaks), probs = seq(0,1,by=1/ng))
  vals[1] <- 0
  vals[ng] <- 1
  df %>% ggplot(., aes(x=M, y=A)) +
        stat_summary_hex(aes(z = gc), fun="mean", 
                         show.legend=FALSE, bins = bins) +
    #stat_summary_hex(aes(z = gc), fun=trunc.median, fun.args = list(gcContentPeaks = gc), show.legend=FALSE, bins = bins) +
    theme_classic() +
    # scale_fill_gradientn(colours=wesanderson::wes_palette("Zissou1", n=ng, type="continuous"), name="GC content", labels=NULL, limits=c(0.2,0.85)) +
    scale_fill_gradientn(colours=wesanderson::wes_palette("Zissou1", n=ng, type="continuous"), name="GC content", labels=NULL, values=vals) +
    geom_hline(aes(yintercept=0)) +
    geom_smooth(data=df, mapping=aes(x=M, y=A), se=FALSE) +
    xlab("Average log counts per million") +
    ylab("Log2 fold change")
}

maPlotDensity_hex <- function(M, A, gc, ng=20){
  require(ggplot2)
  df <- data.frame(M, A, gc)
  df %>% ggplot(., aes(x=M, y=A)) +
    geom_hex(show.legend = FALSE) +
    theme_classic() +
    geom_hline(aes(yintercept=0)) +
    geom_smooth(data=df, mapping=aes(x=M, y=A), se=FALSE) +
    xlab("Average log counts per million") +
    ylab("Log2 fold change")
}

gcBoxplot <- function(df, title){
  ggplot(df) +
  aes(x=gc, y=logFC, color=gc) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(df$gc), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() +
  ylim(c(-2,2)) +
  xlab("GC-content bin") +
  ggtitle(title) +
  theme(axis.text.x = element_text(size=0),
        legend.position = "none",
        axis.title = element_text(size=16))
}



titles <- c("Raw", "DESeq2", "edgeR", "FQ", "FQ-FQ", "cqn", "GC-FQ", "smooth GC-FQ")

pList <- c()
for(kk in 1:length(lrtOut)){
  if(kk == 2){
    pList[[kk]] <- maPlotGC(M=aveLogCPM(normCountsDESeq2), A=lrtOut[[kk]]$log2FoldChange, gc=gcContentPeaks) +
    ggtitle(titles[kk])
  } else {
    pList[[kk]] <- maPlotGC(M=lrtOut[[kk]]$table$logCPM, A=lrtOut[[kk]]$table$logFC, gc=gcContentPeaks) +
    ggtitle(titles[kk])
  }
}
cowplot::plot_grid(plotlist=pList)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_maPlots.pdf", width=8, height=8)
# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_maPlots.png", width=8, height=8, dpi=150)


## MA hex plots, colored acc to GC
pListHex <- c()
for(kk in 1:length(lrtOut)){
  if(kk == 2){
    pListHex[[kk]] <- maPlotGC_hex(M=aveLogCPM(normCountsDESeq2), A=lrtOut[[kk]]$log2FoldChange, gc=gcContentPeaks, ng=40, bins=50) +
    ggtitle(titles[kk]) + 
    ylim(c(-5, 4.5)) + 
    xlim(c(-1.5, 7)) + 
      coord_fixed()
  } else {
  pListHex[[kk]] <- maPlotGC_hex(M=lrtOut[[kk]]$table$logCPM, A=lrtOut[[kk]]$table$logFC, gc=gcContentPeaks, ng=40, bins=50) +
    ggtitle(titles[kk]) + 
    ylim(c(-5, 4.5)) + 
    xlim(c(-1.5, 7)) + 
      coord_fixed()
  }
}
names(pListHex) <- titles
cowplot::plot_grid(plotlist=pListHex)
## Warning: Removed 5 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 2 rows containing missing values (geom_hex).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 20 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 20 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 9 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 3 rows containing missing values (geom_hex).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 4 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 5 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 2 rows containing missing values (geom_hex).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 2 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 3 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_maPlots_hex.pdf", width=8, height=8)

## MA hex plots, colored acc to density
pListDensHex <- c()
for(kk in 1:length(lrtOut)){
  if(kk == 2){
    pListDensHex[[kk]] <- maPlotDensity_hex(M=aveLogCPM(normCountsDESeq2), A=lrtOut[[kk]]$log2FoldChange, gc=gcContentPeaks) +
    ggtitle(titles[kk]) + ylim(c(-5,5)) + coord_fixed()
  } else {
  pListDensHex[[kk]] <- maPlotDensity_hex(M=lrtOut[[kk]]$table$logCPM, A=lrtOut[[kk]]$table$logFC, gc=gcContentPeaks) +
    ggtitle(titles[kk]) + ylim(c(-5,5)) + coord_fixed()
  }
}
names(pListDensHex) <- titles
cowplot::plot_grid(plotlist=pListDensHex)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing missing values (geom_hex).
## Warning: Removed 15 rows containing non-finite values (stat_binhex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_hex).
## Warning: Removed 1 rows containing non-finite values (stat_binhex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_maPlots_density_hex.pdf", width=8, height=8)


pListBox <- list()
gcGroups20 <- Hmisc::cut2(gcContentPeaks, g=20)
for(kk in 1:length(lrtOut)){
  if(kk == 2){
    df <- data.frame(gc=gcGroups20, logFC=lrtOut[[kk]]$log2FoldChange)
    pListBox[[kk]] <- gcBoxplot(df, titles[kk])
  } else {
    df <- data.frame(gc=gcGroups20, logFC=lrtOut[[kk]]$table$logFC)
    pListBox[[kk]] <- gcBoxplot(df, titles[kk]) 
  }
}
cowplot::plot_grid(plotlist=pListBox)
## Warning: Removed 11288 rows containing non-finite values (stat_ydensity).
## Warning: Removed 11288 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11857 rows containing non-finite values (stat_ydensity).
## Warning: Removed 11857 rows containing non-finite values (stat_boxplot).
## Warning: Removed 13394 rows containing non-finite values (stat_ydensity).
## Warning: Removed 13394 rows containing non-finite values (stat_boxplot).
## Warning: Removed 13955 rows containing non-finite values (stat_ydensity).
## Warning: Removed 13955 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11054 rows containing non-finite values (stat_ydensity).
## Warning: Removed 11054 rows containing non-finite values (stat_boxplot).
## Warning: Removed 18600 rows containing non-finite values (stat_ydensity).
## Warning: Removed 18600 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12582 rows containing non-finite values (stat_ydensity).
## Warning: Removed 12582 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10579 rows containing non-finite values (stat_ydensity).
## Warning: Removed 10579 rows containing non-finite values (stat_boxplot).

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_boxplots.pdf", width=16, height=10)


library(UpSetR)
daPeaks <- list()
for(kk in 1:length(lrtOut)){
  if(kk == 2){
    daPeaks[[kk]] <- which(lrtOut[[kk]]$padj <= 0.05)
  } else {
    daPeaks[[kk]] <- which(p.adjust(lrtOut[[kk]]$table$PValue, "fdr") <= 0.05)
  }
}
barplot(unlist(lapply(daPeaks, length)),
        names=normMethods, ylab="Nr DA peaks")

names(daPeaks) <- titles 
# pdf("~/Dropbox/research/atacseq/bulk/plots/fullard_upset.pdf")
upset(fromList(daPeaks), nsets=9, nintersects=10, keep.order=TRUE, order.by="freq")

# dev.off()

Plot for paper

M=lrtOut[[1]]$table$logCPM
A=lrtOut[[1]]$table$logFC
gc=gcContentPeaks
df <- data.frame(M, A, gc)
hlp <- df %>% ggplot(., aes(x=M, y=A)) +
    stat_summary_hex(aes(z = gc)) +
    theme_classic() +
    scale_fill_gradientn(colours=wesanderson::wes_palette("Zissou1", n=10, type="continuous"), name="GC content", labels=NULL) +
    geom_hline(aes(yintercept=0)) +
    geom_smooth(data=df, mapping=aes(x=M, y=A), se=FALSE) +
    xlab("Average log counts per million") +
    ylab("Log2 fold change")
leg <- ggpubr::get_legend(hlp)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
cols <- wesanderson::wes_palette("Zissou1", n=10, type="continuous")

pcaAriPlots <- cowplot::plot_grid(pcaPlots[["GcqnSmooth"]] + ggtitle("") + labs(colour="Region", shape="Cell type") + coord_fixed(), 
                                  ariRegion + scale_color_viridis_d(),
                                  nrow=2, ncol=1, labels=c("a", "b"))
# fix scales
maList <- list(pListHex[["edgeR"]]+ 
    ylim(c(-5, 4.5)) + 
    xlim(c(-1.5, 7)),
               pListHex[["FQ"]] + 
    ylim(c(-5, 4.5)) + 
    xlim(c(-1.5, 7)),
               pListHex[["cqn"]] + 
    ylim(c(-5, 4.5)) + 
    xlim(c(-1.5, 7)),
               pListHex[["smooth GC-FQ"]] + 
    ylim(c(-5, 4.5)) + 
    xlim(c(-1.5, 7)))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
maPlots <- cowplot::plot_grid(plotlist=maList,
                     nrow=2, ncol=2)
## Warning: Removed 9 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 3 rows containing missing values (geom_hex).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 4 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 5 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 2 rows containing missing values (geom_hex).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning: Removed 3 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
cowplot::plot_grid(pcaAriPlots,
                   maPlots,
                   leg,
                   labels=c("", "c",""),
                   nrow=1, ncol=3,
                   rel_widths = c(1,1.2,.3))

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_mainFigure.pdf", width=13, height=9)

Biological interpretation

Overlap with known features

# overlap with known genomic features
# note that hg19 is synonym for GRCh37
library(ChIPpeakAnno)
## Loading required package: grid
## Loading required package: VennDiagram
## Loading required package: futile.logger
## 
## Attaching package: 'VennDiagram'
## The following object is masked from 'package:dendextend':
## 
##     rotate
## 
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
library(EnsDb.Hsapiens.v75)
## Loading required package: ensembldb
## Loading required package: AnnotationFilter
## 
## Attaching package: 'ensembldb'
## The following object is masked from 'package:dplyr':
## 
##     filter
## The following object is masked from 'package:stats':
## 
##     filter
library(org.Hs.eg.db)
## 
data(TSS.human.GRCh37)
peakFeaturesAll <- suppressWarnings(assignChromosomeRegion(gr, nucleotideLevel=FALSE,
                                                  TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
barplot(peakFeaturesAll$percentage, las=1) # across all peaks

## for the DA peaks of each method
peakFeaturesList <- list()
for(kk in 1:length(daPeaks)){
  peakFeaturesList[[kk]] <- suppressWarnings(assignChromosomeRegion(gr[daPeaks[[kk]]],
                                                                    nucleotideLevel=FALSE,
                                                  TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)$percentage)
}
names(peakFeaturesList) <- names(daPeaks)

df <- data.frame(feature = unlist(lapply(peakFeaturesList, names)),
                 percentage = unlist(peakFeaturesList),
                 method = rep(names(peakFeaturesList), each=7))
ggplot(df, aes(x=1,y=percentage, col=method)) +
  theme_classic() +
  geom_jitter(width=.1, height=0) +
  facet_wrap(.~feature, scales="free_y") +
  xlim(c(0.7, 1.3)) +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.line.x = element_blank())

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_featuresAllMethods.pdf", width=7, height=5)

GO enrichment on intersection

Enriched GO terms on intersection of peaks make a lot of sense.

intersectDAPeaks <- Reduce(intersect, daPeaks)
peakFeaturesIst <- suppressWarnings(assignChromosomeRegion(gr[intersectDAPeaks],
                                                           nucleotideLevel=FALSE,
                                                  TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
# intersection is enriched in expected biological features
# pdf("~/Dropbox/research/atacseq/bulk/plots/")
barplot(peakFeaturesIst$percentage / peakFeaturesAll$percentage) ; abline(h=1, lty=2)

annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
sn <- as.character(seqnames(annoData))
sn <- factor(gsub(x=sn, pattern="chr", replacement=""))
annoData <- GRanges(seqnames=sn, ranges=ranges(annoData), strand=strand(annoData),
                    mcols=mcols(annoData))
istPeaksAnnotated <- suppressWarnings(annotatePeakInBatch(gr[intersectDAPeaks],
                                                           AnnotationData=annoData,
                                                         output="nearestBiDirectionalPromoters",
                                                         bindingRegion=c(-2000, 500)))
## Annotate peaks by annoPeaks, see ?annoPeaks for details.
## maxgap will be ignored.
goRes <- getEnrichedGO(istPeaksAnnotated, orgAnn="org.Hs.eg.db", maxP=.05, 
                       minGOterm=10, multiAdjMethod="BH", condense=FALSE)
goResUniq <- lapply(goRes, function(tab){
  tab2 <- unique(tab[,1:10])
  tab2 <- tab2[order(tab2$pvalue, decreasing=FALSE),]
  rownames(tab2) <- NULL
  tab2
  })

head(goResUniq$bp)
##        go.id                       go.term
## 1 GO:0007399    nervous system development
## 2 GO:0022008                  neurogenesis
## 3 GO:0048666            neuron development
## 4 GO:0030182        neuron differentiation
## 5 GO:0031175 neuron projection development
## 6 GO:0048699         generation of neurons
##                                                                                                                                                                                                                                                        Definition
## 1                                                                                                                                      The process whose specific outcome is the progression of nervous tissue over time, from its formation to its mature state.
## 2                                                                                                                                                                                                                  Generation of cells within the nervous system.
## 3                                                                               The process whose specific outcome is the progression of a neuron over time, from initial commitment of the cell to a specific fate, to the fully functional differentiated cell.
## 4                                                                                                                                                                 The process in which a relatively unspecialized cell acquires specialized features of a neuron.
## 5 The process whose specific outcome is the progression of a neuron projection over time, from its formation to the mature structure. A neuron projection is any process extending from a neural cell, such as axons or dendrites (collectively called neurites).
## 6                                                                                                                             The process in which nerve cells are generated. This includes the production of neuroblasts and their differentiation into neurons.
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       BP 9.252788e-10             960           2375             549339
## 2       BP 3.859252e-09             672           1623             549339
## 3       BP 1.613004e-08             472           1110             549339
## 4       BP 1.861189e-08             569           1365             549339
## 5       BP 2.438052e-08             420            978             549339
## 6       BP 4.895908e-08             626           1524             549339
##   totaltermInGenome BH.adjusted.p.value
## 1           1593489        1.174641e-05
## 2           1593489        2.449660e-05
## 3           1593489        5.906949e-05
## 4           1593489        5.906949e-05
## 5           1593489        6.190213e-05
## 6           1593489        1.035893e-04
head(goResUniq$mf)
##        go.id                                    go.term
## 1 GO:0042393                            histone binding
## 2 GO:0019899                             enzyme binding
## 3 GO:0034979 NAD-dependent protein deacetylase activity
## 4 GO:0017136 NAD-dependent histone deacetylase activity
##                                                                                                                                                                                                                                                                                                                          Definition
## 1 Interacting selectively and non-covalently with a histone, any of a group of water-soluble proteins found in association with the DNA of eukaroytic chromosomes. They are involved in the condensation and coiling of chromosomes during cell division and have also been implicated in nonspecific suppression of gene activity.
## 2                                                                                                                                                                                                                                                                       Interacting selectively and non-covalently with any enzyme.
## 3                                                                                                                                                                                                                                              Catalysis of the removal of one or more acetyl groups from a protein, requiring NAD.
## 4                                                                                                                               Catalysis of the reaction: histone N6-acetyl-L-lysine + H2O = histone L-lysine + acetate. This reaction requires the presence of NAD, and represents the removal of an acetyl group from a histone.
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       MF 1.092682e-05              96            197              86430
## 2       MF 1.139281e-05             845           2217              86430
## 3       MF 1.438213e-05              14             16              86430
## 4       MF 3.739649e-05              13             15              86430
##   totaltermInGenome BH.adjusted.p.value
## 1            255581          0.01527862
## 2            255581          0.01527862
## 3            255581          0.01527862
## 4            255581          0.02979565
head(goResUniq$cc)
##        go.id                     go.term
## 1 GO:0005654                 nucleoplasm
## 2 GO:1902494           catalytic complex
## 3 GO:0045202                     synapse
## 4 GO:0044451            nucleoplasm part
## 5 GO:0097458                 neuron part
## 6 GO:0036477 somatodendritic compartment
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Definition
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           That part of the nuclear content other than the chromosomes or the nucleolus.
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <NA>
## 3 The junction between a nerve fiber of one neuron and another neuron, muscle fiber or glial cell. As the nerve fiber approaches the synapse it enlarges into a specialized structure, the presynaptic nerve ending, which contains mitochondria and synaptic vesicles. At the tip of the nerve ending is the presynaptic membrane; facing it, and separated from it by a minute cleft (the synaptic cleft) is a specialized area of membrane on the receiving cell, known as the postsynaptic membrane. In response to the arrival of nerve impulses, the presynaptic nerve ending secretes molecules of neurotransmitters into the synaptic cleft. These diffuse across the cleft and transmit the signal to the postsynaptic membrane.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  Any constituent part of the nucleoplasm, that part of the nuclear content other than the chromosomes or the nucleolus.
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Any constituent part of a neuron, the basic cellular unit of nervous tissue. A typical neuron consists of a cell body (often called the soma), an axon, and dendrites. Their purpose is to receive, conduct, and transmit impulses in the nervous system.
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  The region of a neuron that includes the cell body (cell soma) and dendrite(s), but excludes the axon.
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       CC 9.877700e-11            1335           3512             152412
## 2       CC 2.163806e-08             548           1370             152412
## 3       CC 1.745873e-07             469           1171             152412
## 4       CC 5.503868e-07             449           1126             152412
## 5       CC 1.104473e-06             663           1729             152412
## 6       CC 3.718441e-06             340            843             152412
##   totaltermInGenome BH.adjusted.p.value
## 1            463063        1.634759e-07
## 2            463063        1.790550e-05
## 3            463063        9.631398e-05
## 4            463063        2.277226e-04
## 5            463063        3.655806e-04
## 6            463063        1.025670e-03
xtable::xtable(head(goResUniq$bp[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Tue Sep 28 08:15:29 2021
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## 1 & GO:0007399 & nervous system development & 0.0000 \\ 
##   2 & GO:0022008 & neurogenesis & 0.0000 \\ 
##   3 & GO:0048666 & neuron development & 0.0001 \\ 
##   4 & GO:0030182 & neuron differentiation & 0.0001 \\ 
##   5 & GO:0031175 & neuron projection development & 0.0001 \\ 
##   6 & GO:0048699 & generation of neurons & 0.0001 \\ 
##   7 & GO:0045664 & regulation of neuron differentiation & 0.0005 \\ 
##   8 & GO:0032990 & cell part morphogenesis & 0.0005 \\ 
##   9 & GO:0050767 & regulation of neurogenesis & 0.0005 \\ 
##   10 & GO:0007417 & central nervous system development & 0.0005 \\ 
##   11 & GO:0031329 & regulation of cellular catabolic process & 0.0007 \\ 
##   12 & GO:0051960 & regulation of nervous system development & 0.0007 \\ 
##   13 & GO:0030030 & cell projection organization & 0.0007 \\ 
##   14 & GO:0000122 & negative regulation of transcription by RNA polymerase II & 0.0007 \\ 
##   15 & GO:0007420 & brain development & 0.0007 \\ 
##   16 & GO:0030900 & forebrain development & 0.0007 \\ 
##   17 & GO:0120036 & plasma membrane bounded cell projection organization & 0.0007 \\ 
##   18 & GO:0098732 & macromolecule deacylation & 0.0008 \\ 
##   19 & GO:0048812 & neuron projection morphogenesis & 0.0008 \\ 
##   20 & GO:0009894 & regulation of catabolic process & 0.0008 \\ 
##    \hline
## \end{tabular}
## \end{table}
xtable::xtable(head(goResUniq$cc[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Tue Sep 28 08:15:29 2021
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## 1 & GO:0005654 & nucleoplasm & 0.0000 \\ 
##   2 & GO:1902494 & catalytic complex & 0.0000 \\ 
##   3 & GO:0045202 & synapse & 0.0001 \\ 
##   4 & GO:0044451 & nucleoplasm part & 0.0002 \\ 
##   5 & GO:0097458 & neuron part & 0.0004 \\ 
##   6 & GO:0036477 & somatodendritic compartment & 0.0010 \\ 
##   7 & GO:0044456 & synapse part & 0.0011 \\ 
##   8 & GO:0097447 & dendritic tree & 0.0014 \\ 
##   9 & GO:0030425 & dendrite & 0.0014 \\ 
##   10 & GO:0005829 & cytosol & 0.0015 \\ 
##   11 & GO:1990234 & transferase complex & 0.0019 \\ 
##   12 & GO:0099572 & postsynaptic specialization & 0.0025 \\ 
##   13 & GO:0098794 & postsynapse & 0.0034 \\ 
##   14 & GO:0098982 & GABA-ergic synapse & 0.0048 \\ 
##   15 & GO:0042995 & cell projection & 0.0048 \\ 
##   16 & GO:0043005 & neuron projection & 0.0051 \\ 
##   17 & GO:0030054 & cell junction & 0.0051 \\ 
##   18 & GO:0098984 & neuron to neuron synapse & 0.0059 \\ 
##   19 & GO:0014069 & postsynaptic density & 0.0061 \\ 
##   20 & GO:0120025 & plasma membrane bounded cell projection & 0.0061 \\ 
##    \hline
## \end{tabular}
## \end{table}
sum(goResUniq$bp$BH.adjusted.p.value <= 0.05)
## [1] 68

GO enrichment on peaks only found by cqn

cqnPeaks <- daPeaks$cqn
otherPeaks <- unique(do.call(c, daPeaks[!names(daPeaks) %in% "cqn"]))
cqnUniquePeaks <- cqnPeaks[!cqnPeaks %in% otherPeaks]

### interesting genomic features are depleted, re-encouraging these could be FP results
peakFeaturesCqn <- suppressWarnings(assignChromosomeRegion(gr[cqnUniquePeaks],
                                                           nucleotideLevel=FALSE,
                                                  TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
barplot(peakFeaturesCqn$percentage / peakFeaturesAll$percentage) ; abline(h=1, lty=2)

## GO enrichment recovers no gene sets.
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
sn <- as.character(seqnames(annoData))
sn <- factor(gsub(x=sn, pattern="chr", replacement=""))
annoData <- GRanges(seqnames=sn, ranges=ranges(annoData), strand=strand(annoData),
                    mcols=mcols(annoData))
cqnPeaksAnnotated <- suppressWarnings(annotatePeakInBatch(gr[cqnUniquePeaks],
                                                           AnnotationData=annoData,
                                                         output="nearestBiDirectionalPromoters",
                                                         bindingRegion=c(-2000, 500)))
## Annotate peaks by annoPeaks, see ?annoPeaks for details.
## maxgap will be ignored.
goResCqn <- getEnrichedGO(cqnPeaksAnnotated, orgAnn="org.Hs.eg.db", maxP=1, 
                       minGOterm=10, multiAdjMethod="BH", condense=FALSE)
goResCqnUniq <- lapply(goResCqn, function(tab){
  tab2 <- unique(tab[,1:10])
  tab2 <- tab2[order(tab2$pvalue, decreasing=FALSE),]
  rownames(tab2) <- NULL
  tab2
  })

head(goResCqnUniq$bp)
##        go.id                          go.term
## 1 GO:0010842           retina layer formation
## 2 GO:0072080       nephron tubule development
## 3 GO:0060993             kidney morphogenesis
## 4 GO:0061326         renal tubule development
## 5 GO:0072078     nephron tubule morphogenesis
## 6 GO:0072088 nephron epithelium morphogenesis
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           Definition
## 1 The process in which the vertebrate retina is organized into three laminae: the outer nuclear layer (ONL), which contains photoreceptor nuclei; the inner nuclear layer (INL), which contains amacrine, bipolar and horizontal cells; and the retinal ganglion cell (RGC) layer. Between the inner and outer nuclear layers, the outer plexiform layer (OPL) contains connections between the photoreceptors and bipolar and horizontal cells. The inner plexiform layer (IPL) is positioned between the INL and the ganglion cell layer and contains the dendrites of RGCs and processes of bipolar and amacrine cells. Spanning all layers of the retina are the radially oriented Mueller glia.
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          The progression of a nephron tubule over time, from its initial formation to the mature structure. A nephron tubule is an epithelial tube that is part of the nephron, the functional part of the kidney.
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      Morphogenesis of a kidney. A kidney is an organ that filters the blood and excretes the end products of body metabolism in the form of urine.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                The progression of the renal tubule over time from its formation to the mature form. A renal tubule is a tube that filters, re-absorbs and secretes substances to rid an organism of waste and to play a role in fluid homeostasis.
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             The process in which the anatomical structures of a nephron tubule are generated and organized. A nephron tubule is an epithelial tube that is part of the nephron, the functional part of the kidney.
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     The process in which the anatomical structures of the nephron epithelium are generated and organized. The nephron epithelium is a tissue that covers the surface of a nephron.
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       BP 0.0001373078               4             22              19494
## 2       BP 0.0001549540               7             93              19494
## 3       BP 0.0001656586               7             94              19494
## 4       BP 0.0001769532               7             95              19494
## 5       BP 0.0003047489               6             74              19494
## 6       BP 0.0003522613               6             76              19494
##   totaltermInGenome BH.adjusted.p.value
## 1           1593489           0.1252829
## 2           1593489           0.1252829
## 3           1593489           0.1252829
## 4           1593489           0.1252829
## 5           1593489           0.1313617
## 6           1593489           0.1313617
head(goResCqnUniq$mf)
##        go.id                             go.term
## 1 GO:0019869 chloride channel inhibitor activity
## 2 GO:1901612                 cardiolipin binding
## 3 GO:0001664  G protein-coupled receptor binding
## 4 GO:0042562                     hormone binding
## 5 GO:0005126           cytokine receptor binding
## 6 GO:0009055          electron transfer activity
##                                                                                                                                                                                                                                                                                                                                                     Definition
## 1                                                                                                                                                                                                                                                                                              Stops, prevents, or reduces the activity of a chloride channel.
## 2                                                                                                                                                                                                                                                                                                                                                         <NA>
## 3                                                                                                                                                                                                                                                                                Interacting selectively and non-covalently with a G protein-coupled receptor.
## 4                                                                                                                             Interacting selectively and non-covalently with any hormone, naturally occurring substances secreted by specialized cells that affect the metabolism or behavior of other cells possessing functional receptors for the hormone.
## 5                                                                                                                                                                                                                                                                                         Interacting selectively and non-covalently with a cytokine receptor.
## 6 Any molecular entity that serves as an electron acceptor and electron donor in an electron transport chain. An electron transport chain is a process in which a series of electron carriers operate together to transfer electrons from donors to any of several different terminal electron acceptors to generate a transmembrane electrochemical gradient.
##   Ontology      pvalue count.InDataset count.InGenome totaltermInDataset
## 1       MF 0.006355624               2             10               3139
## 2       MF 0.007704947               2             11               3139
## 3       MF 0.008412651               9            280               3139
## 4       MF 0.008696842               5            102               3139
## 5       MF 0.009576964               9            286               3139
## 6       MF 0.013606798               5            114               3139
##   totaltermInGenome BH.adjusted.p.value
## 1            255581           0.2769748
## 2            255581           0.2769748
## 3            255581           0.2769748
## 4            255581           0.2769748
## 5            255581           0.2769748
## 6            255581           0.2769748
head(goResCqnUniq$cc)
##        go.id                   go.term
## 1 GO:0005923 bicellular tight junction
## 2 GO:0070160            tight junction
## 3 GO:0060198 clathrin-sculpted vesicle
## 4 GO:0043296   apical junction complex
## 5 GO:0005765        lysosomal membrane
## 6 GO:0098852    lytic vacuole membrane
##                                                                                                                                                                                                                                                                                                                                                                                                                                         Definition
## 1 An occluding cell-cell junction that is composed of a branching network of sealing strands that completely encircles the apical end of each cell in an epithelial sheet; the outer leaflets of the two interacting plasma membranes are seen to be tightly apposed where sealing strands are present. Each sealing strand is composed of a long row of transmembrane adhesion proteins embedded in each of the two interacting plasma membranes.
## 2                                                                                                                                                                                                                                                                                A cell-cell junction that seals cells together in an epithelium in a way that prevents even small molecules from leaking from one side of the sheet to the other.
## 3                                                                                                                                                                                                                                                                                                                                                              A clathrin-sculpted lipid bilayer membrane-enclosed vesicle after clathrin release.
## 4 A functional unit located near the cell apex at the points of contact between epithelial cells, which in vertebrates is composed of the tight junction, the zonula adherens, and desmosomes and in some invertebrates, such as Drosophila, is composed of the subapical complex (SAC), the zonula adherens and the septate junction. Functions in the regulation of cell polarity, tissue integrity and intercellular adhesion and permeability.
## 5                                                                                                                                                                                                                                                                                                                                                  The lipid bilayer surrounding the lysosome and separating its contents from the cell cytoplasm.
## 6                                                                                                                                                                                                                                                                                                                                        The lipid bilayer surrounding a lytic vacuole and separating its contents from the cytoplasm of the cell.
##   Ontology      pvalue count.InDataset count.InGenome totaltermInDataset
## 1       CC 0.004722150               6            123               5801
## 2       CC 0.005719920               6            128               5801
## 3       CC 0.009527055               2             12               5801
## 4       CC 0.009631775               6            143               5801
## 5       CC 0.014974646              10            354               5801
## 6       CC 0.015241153              10            355               5801
##   totaltermInGenome BH.adjusted.p.value
## 1            463063           0.4509883
## 2            463063           0.4509883
## 3            463063           0.4509883
## 4            463063           0.4509883
## 5            463063           0.4841307
## 6            463063           0.4841307
xtable::xtable(head(goResCqnUniq$bp[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Tue Sep 28 08:16:19 2021
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## 1 & GO:0010842 & retina layer formation & 0.1253 \\ 
##   2 & GO:0072080 & nephron tubule development & 0.1253 \\ 
##   3 & GO:0060993 & kidney morphogenesis & 0.1253 \\ 
##   4 & GO:0061326 & renal tubule development & 0.1253 \\ 
##   5 & GO:0072078 & nephron tubule morphogenesis & 0.1314 \\ 
##   6 & GO:0072088 & nephron epithelium morphogenesis & 0.1314 \\ 
##   7 & GO:0061333 & renal tubule morphogenesis & 0.1314 \\ 
##   8 & GO:0072028 & nephron morphogenesis & 0.1314 \\ 
##   9 & GO:0072009 & nephron epithelium development & 0.1314 \\ 
##   10 & GO:0060675 & ureteric bud morphogenesis & 0.2517 \\ 
##   11 & GO:0001657 & ureteric bud development & 0.2517 \\ 
##   12 & GO:0010874 & regulation of cholesterol efflux & 0.2517 \\ 
##   13 & GO:0072171 & mesonephric tubule morphogenesis & 0.2517 \\ 
##   14 & GO:0072163 & mesonephric epithelium development & 0.2517 \\ 
##   15 & GO:0072164 & mesonephric tubule development & 0.2517 \\ 
##   16 & GO:0010985 & negative regulation of lipoprotein particle clearance & 0.2711 \\ 
##   17 & GO:0001823 & mesonephros development & 0.2796 \\ 
##   18 & GO:0072073 & kidney epithelium development & 0.2881 \\ 
##   19 & GO:0072006 & nephron development & 0.2987 \\ 
##   20 & GO:0048593 & camera-type eye morphogenesis & 0.3765 \\ 
##    \hline
## \end{tabular}
## \end{table}
xtable::xtable(head(goResCqnUniq$cc[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Tue Sep 28 08:16:19 2021
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## 1 & GO:0005923 & bicellular tight junction & 0.4510 \\ 
##   2 & GO:0070160 & tight junction & 0.4510 \\ 
##   3 & GO:0060198 & clathrin-sculpted vesicle & 0.4510 \\ 
##   4 & GO:0043296 & apical junction complex & 0.4510 \\ 
##   5 & GO:0005765 & lysosomal membrane & 0.4841 \\ 
##   6 & GO:0098852 & lytic vacuole membrane & 0.4841 \\ 
##   7 & GO:0005789 & endoplasmic reticulum membrane & 0.5204 \\ 
##   8 & GO:0098827 & endoplasmic reticulum subcompartment & 0.5204 \\ 
##   9 & GO:0043034 & costamere & 0.5204 \\ 
##   10 & GO:0042175 & nuclear outer membrane-endoplasmic reticulum membrane network & 0.5204 \\ 
##   11 & GO:0044432 & endoplasmic reticulum part & 0.5204 \\ 
##   12 & GO:0150034 & distal axon & 0.5650 \\ 
##   13 & GO:0044306 & neuron projection terminus & 0.5650 \\ 
##   14 & GO:0005774 & vacuolar membrane & 0.5650 \\ 
##   15 & GO:0044445 & cytosolic part & 0.5650 \\ 
##   16 & GO:0022625 & cytosolic large ribosomal subunit & 0.5650 \\ 
##   17 & GO:0031224 & intrinsic component of membrane & 0.5670 \\ 
##   18 & GO:0043025 & neuronal cell body & 0.5670 \\ 
##   19 & GO:0022626 & cytosolic ribosome & 0.5714 \\ 
##   20 & GO:0005801 & cis-Golgi network & 0.5721 \\ 
##    \hline
## \end{tabular}
## \end{table}
# fraction annotated
length(cqnPeaksAnnotated) / length(cqnUniquePeaks)
## [1] 0.02989516

GO enrichment on peaks only found by FQ-FQ and (smooth) GC-FQ but not FQ

# gcfqPeaks <- unique(c(daPeaks$`FQ-FQ`, daPeaks$`GC-FQ`, daPeaks$`smooth GC-FQ`))
gcfqPeaks <- list(daPeaks$`FQ-FQ`, daPeaks$`GC-FQ`, daPeaks$`smooth GC-FQ`)
gcfqPeaks <- Reduce(intersect, gcfqPeaks)
fqPeaks <- daPeaks$FQ
gcfqUniquePeaks <- gcfqPeaks[!gcfqPeaks %in% fqPeaks]

### genomic features
peakFeaturesGcfq <- suppressWarnings(assignChromosomeRegion(gr[gcfqUniquePeaks],
                                                           nucleotideLevel=FALSE,
                                                  TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
barplot(peakFeaturesGcfq$percentage / peakFeaturesAll$percentage) ; abline(h=1, lty=2)

## GO enrichment
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
sn <- as.character(seqnames(annoData))
sn <- factor(gsub(x=sn, pattern="chr", replacement=""))
annoData <- GRanges(seqnames=sn, ranges=ranges(annoData), strand=strand(annoData),
                    mcols=mcols(annoData))
gcfqPeaksAnnotated <- suppressWarnings(annotatePeakInBatch(gr[gcfqUniquePeaks],
                                                           AnnotationData=annoData,
                                                         output="nearestBiDirectionalPromoters",
                                                         bindingRegion=c(-2000, 500)))
## Annotate peaks by annoPeaks, see ?annoPeaks for details.
## maxgap will be ignored.
goResgcfq <- getEnrichedGO(gcfqPeaksAnnotated, orgAnn="org.Hs.eg.db", maxP=1, 
                       minGOterm=10, multiAdjMethod="BH", condense=FALSE)
goResGcfqUniq <- lapply(goResgcfq, function(tab){
  tab2 <- unique(tab[,1:10])
  tab2 <- tab2[order(tab2$pvalue, decreasing=FALSE),]
  rownames(tab2) <- NULL
  tab2
  })

head(goResGcfqUniq$bp)
##        go.id                                     go.term
## 1 GO:0050803 regulation of synapse structure or activity
## 2 GO:0032922     circadian regulation of gene expression
## 3 GO:0007416                            synapse assembly
## 4 GO:0050808                        synapse organization
## 5 GO:0001964                            startle response
## 6 GO:0050807          regulation of synapse organization
##                                                                                                                                                                                                                          Definition
## 1                                                                        Any process that modulates the physical form or the activity of a synapse, the junction between a neuron and a target (neuron, muscle, or secretory cell).
## 2                                                                   Any process that modulates the frequency, rate or extent of gene expression such that an expression pattern recurs with a regularity of approximately 24 hours.
## 3                                                                           The aggregation, arrangement and bonding together of a set of components to form a synapse.  This process ends when the synapse is mature (functional).
## 4 A process that is carried out at the cellular level which results in the assembly, arrangement of constituent parts, or disassembly of a synapse, the junction between a neuron and a target (neuron, muscle, or secretory cell).
## 5                                                                                                                                                     An action or movement due to the application of a sudden unexpected stimulus.
## 6                                                                                        Any process that modulates the physical form of a synapse, the junction between a neuron and a target (neuron, muscle, or secretory cell).
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       BP 3.387435e-05              24            227              66581
## 2       BP 4.770692e-05              11             62              66581
## 3       BP 5.820621e-05              20            177              66581
## 4       BP 6.027712e-05              35            408              66581
## 5       BP 9.414306e-05               7             27              66581
## 6       BP 1.390087e-04              22            218              66581
##   totaltermInGenome BH.adjusted.p.value
## 1           1593489          0.09650366
## 2           1593489          0.09650366
## 3           1593489          0.09650366
## 4           1593489          0.09650366
## 5           1593489          0.12057843
## 6           1593489          0.14836859
head(goResGcfqUniq$mf)
##        go.id                                        go.term
## 1 GO:0015085 calcium ion transmembrane transporter activity
## 2 GO:0005262                       calcium channel activity
## 3 GO:0008324      cation transmembrane transporter activity
## 4 GO:0005245         voltage-gated calcium channel activity
## 5 GO:0008022                     protein C-terminus binding
## 6 GO:0046873   metal ion transmembrane transporter activity
##                                                                                                                                                                                                                         Definition
## 1                                                                                                                                              Enables the transfer of calcium (Ca) ions from one side of a membrane to the other.
## 2                       Enables the facilitated diffusion of a calcium ion (by an energy-independent process) involving passage through a transmembrane aqueous pore or channel without evidence for a carrier-mediated mechanism.
## 3                                                                                                                                                         Enables the transfer of cation from one side of a membrane to the other.
## 4                   Enables the transmembrane transfer of a calcium ion by a voltage-gated channel. A voltage-gated channel is a channel whose open state is dependent on the voltage across the membrane in which it is embedded.
## 5 Interacting selectively and non-covalently with a protein C-terminus, the end of any peptide chain at which the 1-carboxy function of a constituent amino acid is not attached in peptide linkage to another amino-acid residue.
## 6                                                                                                                                                     Enables the transfer of metal ions from one side of a membrane to the other.
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       MF 0.0002732808              16            139              10780
## 2       MF 0.0007174203              14            123              10780
## 3       MF 0.0008038738              45            644              10780
## 4       MF 0.0008316532               8             48              10780
## 5       MF 0.0010322785              18            187              10780
## 6       MF 0.0011116586              33            438              10780
##   totaltermInGenome BH.adjusted.p.value
## 1            255581           0.1827966
## 2            255581           0.1827966
## 3            255581           0.1827966
## 4            255581           0.1827966
## 5            255581           0.1827966
## 6            255581           0.1827966
head(goResGcfqUniq$cc)
##        go.id               go.term
## 1 GO:0044456          synapse part
## 2 GO:0045202               synapse
## 3 GO:0097060     synaptic membrane
## 4 GO:0098978 glutamatergic synapse
## 5 GO:0098794           postsynapse
## 6 GO:0045211 postsynaptic membrane
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Definition
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Any constituent part of a synapse, the junction between a nerve fiber of one neuron and another neuron or muscle fiber or glial cell.
## 2 The junction between a nerve fiber of one neuron and another neuron, muscle fiber or glial cell. As the nerve fiber approaches the synapse it enlarges into a specialized structure, the presynaptic nerve ending, which contains mitochondria and synaptic vesicles. At the tip of the nerve ending is the presynaptic membrane; facing it, and separated from it by a minute cleft (the synaptic cleft) is a specialized area of membrane on the receiving cell, known as the postsynaptic membrane. In response to the arrival of nerve impulses, the presynaptic nerve ending secretes molecules of neurotransmitters into the synaptic cleft. These diffuse across the cleft and transmit the signal to the postsynaptic membrane.
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      A specialized area of membrane on either the presynaptic or the postsynaptic side of a synapse, the junction between a nerve fiber of one neuron and another neuron or muscle fiber or glial cell.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    A synapse that uses glutamate as a neurotransmitter.
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           The part of a synapse that is part of the post-synaptic cell.
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                            A specialized area of membrane facing the presynaptic membrane on the tip of the nerve ending and separated from it by a minute cleft (the synaptic cleft). Neurotransmitters cross the synaptic cleft and transmit the signal to the postsynaptic membrane.
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       CC 4.107520e-07              74            938              19654
## 2       CC 1.831570e-06              85           1171              19654
## 3       CC 4.527379e-06              40            432              19654
## 4       CC 7.730741e-06              34            349              19654
## 5       CC 2.352577e-05              49            613              19654
## 6       CC 2.524514e-05              31            323              19654
##   totaltermInGenome BH.adjusted.p.value
## 1            463063        0.0003524252
## 2            463063        0.0007857434
## 3            463063        0.0012948305
## 4            463063        0.0016582440
## 5            463063        0.0036100555
## 6            463063        0.0036100555
xtable::xtable(head(goResGcfqUniq$bp[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Tue Sep 28 08:17:16 2021
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## 1 & GO:0050803 & regulation of synapse structure or activity & 0.0965 \\ 
##   2 & GO:0032922 & circadian regulation of gene expression & 0.0965 \\ 
##   3 & GO:0007416 & synapse assembly & 0.0965 \\ 
##   4 & GO:0050808 & synapse organization & 0.0965 \\ 
##   5 & GO:0001964 & startle response & 0.1206 \\ 
##   6 & GO:0050807 & regulation of synapse organization & 0.1484 \\ 
##   7 & GO:0060291 & long-term synaptic potentiation & 0.2113 \\ 
##   8 & GO:0099536 & synaptic signaling & 0.2113 \\ 
##   9 & GO:0007268 & chemical synaptic transmission & 0.2113 \\ 
##   10 & GO:0098916 & anterograde trans-synaptic signaling & 0.2113 \\ 
##   11 & GO:0097120 & receptor localization to synapse & 0.2113 \\ 
##   12 & GO:0051963 & regulation of synapse assembly & 0.2113 \\ 
##   13 & GO:0007623 & circadian rhythm & 0.2113 \\ 
##   14 & GO:0051965 & positive regulation of synapse assembly & 0.2113 \\ 
##   15 & GO:0099537 & trans-synaptic signaling & 0.2113 \\ 
##   16 & GO:0099632 & protein transport within plasma membrane & 0.2113 \\ 
##   17 & GO:0099637 & neurotransmitter receptor transport & 0.2113 \\ 
##   18 & GO:0050806 & positive regulation of synaptic transmission & 0.2372 \\ 
##   19 & GO:0050804 & modulation of chemical synaptic transmission & 0.2372 \\ 
##   20 & GO:0033555 & multicellular organismal response to stress & 0.2372 \\ 
##    \hline
## \end{tabular}
## \end{table}
xtable::xtable(head(goResGcfqUniq$cc[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Tue Sep 28 08:17:16 2021
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## 1 & GO:0044456 & synapse part & 0.0004 \\ 
##   2 & GO:0045202 & synapse & 0.0008 \\ 
##   3 & GO:0097060 & synaptic membrane & 0.0013 \\ 
##   4 & GO:0098978 & glutamatergic synapse & 0.0017 \\ 
##   5 & GO:0098794 & postsynapse & 0.0036 \\ 
##   6 & GO:0045211 & postsynaptic membrane & 0.0036 \\ 
##   7 & GO:0031594 & neuromuscular junction & 0.0101 \\ 
##   8 & GO:0097458 & neuron part & 0.0101 \\ 
##   9 & GO:0036477 & somatodendritic compartment & 0.0155 \\ 
##   10 & GO:1904813 & ficolin-1-rich granule lumen & 0.0210 \\ 
##   11 & GO:0043005 & neuron projection & 0.0210 \\ 
##   12 & GO:0034703 & cation channel complex & 0.0361 \\ 
##   13 & GO:0030425 & dendrite & 0.0498 \\ 
##   14 & GO:0097447 & dendritic tree & 0.0498 \\ 
##   15 & GO:0043197 & dendritic spine & 0.0529 \\ 
##   16 & GO:0044309 & neuron spine & 0.0565 \\ 
##   17 & GO:0044297 & cell body & 0.0623 \\ 
##   18 & GO:0008328 & ionotropic glutamate receptor complex & 0.0623 \\ 
##   19 & GO:0099055 & integral component of postsynaptic membrane & 0.0653 \\ 
##   20 & GO:0098878 & neurotransmitter receptor complex & 0.0723 \\ 
##    \hline
## \end{tabular}
## \end{table}
# fraction annotated
length(gcfqPeaksAnnotated) / length(gcfqUniquePeaks)
## [1] 0.1437259

Genomic feature plots

#pdf("~/Dropbox/research/atacseq/bulk/plots/fullard_genomicFeatures.pdf", width=9)
par(mfrow=c(3,1))
barplot(peakFeaturesIst$percentage / peakFeaturesAll$percentage,
        main="Intersection across normalization methods",
        xlab="Genomic feature", ylab="Enrichment relative to all peaks") ; abline(h=1, lty=2, col="red", lwd=2)
barplot(peakFeaturesGcfq$percentage / peakFeaturesAll$percentage,
        main="(smooth) GC-FQ and FQ-FQ but not FQ",
        xlab="Genomic feature", ylab="Enrichment relative to all peaks") ; abline(h=1, lty=2, col="red", lwd=2)
barplot(peakFeaturesCqn$percentage / peakFeaturesAll$percentage,
        main="Peaks only found by cqn",
        xlab="Genomic feature", ylab="Enrichment relative to all peaks") ; abline(h=1, lty=2, col="red", lwd=2)

#dev.off()

Overlap using annotatr

library(annotatr)
grep(x=annotatr::builtin_annotations(),
     pattern="hg19",
     value=TRUE)
##   [1] "hg19_genes_1to5kb"                    
##   [2] "hg19_genes_promoters"                 
##   [3] "hg19_genes_cds"                       
##   [4] "hg19_genes_5UTRs"                     
##   [5] "hg19_genes_exons"                     
##   [6] "hg19_genes_firstexons"                
##   [7] "hg19_genes_introns"                   
##   [8] "hg19_genes_intronexonboundaries"      
##   [9] "hg19_genes_exonintronboundaries"      
##  [10] "hg19_genes_3UTRs"                     
##  [11] "hg19_genes_intergenic"                
##  [12] "hg19_cpg_islands"                     
##  [13] "hg19_cpg_shores"                      
##  [14] "hg19_cpg_shelves"                     
##  [15] "hg19_cpg_inter"                       
##  [16] "hg19_chromatin_Gm12878-ActivePromoter"
##  [17] "hg19_chromatin_H1hesc-ActivePromoter" 
##  [18] "hg19_chromatin_Hepg2-ActivePromoter"  
##  [19] "hg19_chromatin_Hmec-ActivePromoter"   
##  [20] "hg19_chromatin_Hsmm-ActivePromoter"   
##  [21] "hg19_chromatin_Huvec-ActivePromoter"  
##  [22] "hg19_chromatin_K562-ActivePromoter"   
##  [23] "hg19_chromatin_Nhek-ActivePromoter"   
##  [24] "hg19_chromatin_Nhlf-ActivePromoter"   
##  [25] "hg19_chromatin_Gm12878-WeakPromoter"  
##  [26] "hg19_chromatin_H1hesc-WeakPromoter"   
##  [27] "hg19_chromatin_Hepg2-WeakPromoter"    
##  [28] "hg19_chromatin_Hmec-WeakPromoter"     
##  [29] "hg19_chromatin_Hsmm-WeakPromoter"     
##  [30] "hg19_chromatin_Huvec-WeakPromoter"    
##  [31] "hg19_chromatin_K562-WeakPromoter"     
##  [32] "hg19_chromatin_Nhek-WeakPromoter"     
##  [33] "hg19_chromatin_Nhlf-WeakPromoter"     
##  [34] "hg19_chromatin_Gm12878-PoisedPromoter"
##  [35] "hg19_chromatin_H1hesc-PoisedPromoter" 
##  [36] "hg19_chromatin_Hepg2-PoisedPromoter"  
##  [37] "hg19_chromatin_Hmec-PoisedPromoter"   
##  [38] "hg19_chromatin_Hsmm-PoisedPromoter"   
##  [39] "hg19_chromatin_Huvec-PoisedPromoter"  
##  [40] "hg19_chromatin_K562-PoisedPromoter"   
##  [41] "hg19_chromatin_Nhek-PoisedPromoter"   
##  [42] "hg19_chromatin_Nhlf-PoisedPromoter"   
##  [43] "hg19_chromatin_Gm12878-StrongEnhancer"
##  [44] "hg19_chromatin_H1hesc-StrongEnhancer" 
##  [45] "hg19_chromatin_Hepg2-StrongEnhancer"  
##  [46] "hg19_chromatin_Hmec-StrongEnhancer"   
##  [47] "hg19_chromatin_Hsmm-StrongEnhancer"   
##  [48] "hg19_chromatin_Huvec-StrongEnhancer"  
##  [49] "hg19_chromatin_K562-StrongEnhancer"   
##  [50] "hg19_chromatin_Nhek-StrongEnhancer"   
##  [51] "hg19_chromatin_Nhlf-StrongEnhancer"   
##  [52] "hg19_chromatin_Gm12878-WeakEnhancer"  
##  [53] "hg19_chromatin_H1hesc-WeakEnhancer"   
##  [54] "hg19_chromatin_Hepg2-WeakEnhancer"    
##  [55] "hg19_chromatin_Hmec-WeakEnhancer"     
##  [56] "hg19_chromatin_Hsmm-WeakEnhancer"     
##  [57] "hg19_chromatin_Huvec-WeakEnhancer"    
##  [58] "hg19_chromatin_K562-WeakEnhancer"     
##  [59] "hg19_chromatin_Nhek-WeakEnhancer"     
##  [60] "hg19_chromatin_Nhlf-WeakEnhancer"     
##  [61] "hg19_chromatin_Gm12878-Insulator"     
##  [62] "hg19_chromatin_H1hesc-Insulator"      
##  [63] "hg19_chromatin_Hepg2-Insulator"       
##  [64] "hg19_chromatin_Hmec-Insulator"        
##  [65] "hg19_chromatin_Hsmm-Insulator"        
##  [66] "hg19_chromatin_Huvec-Insulator"       
##  [67] "hg19_chromatin_K562-Insulator"        
##  [68] "hg19_chromatin_Nhek-Insulator"        
##  [69] "hg19_chromatin_Nhlf-Insulator"        
##  [70] "hg19_chromatin_Gm12878-TxnTransition" 
##  [71] "hg19_chromatin_H1hesc-TxnTransition"  
##  [72] "hg19_chromatin_Hepg2-TxnTransition"   
##  [73] "hg19_chromatin_Hmec-TxnTransition"    
##  [74] "hg19_chromatin_Hsmm-TxnTransition"    
##  [75] "hg19_chromatin_Huvec-TxnTransition"   
##  [76] "hg19_chromatin_K562-TxnTransition"    
##  [77] "hg19_chromatin_Nhek-TxnTransition"    
##  [78] "hg19_chromatin_Nhlf-TxnTransition"    
##  [79] "hg19_chromatin_Gm12878-TxnElongation" 
##  [80] "hg19_chromatin_H1hesc-TxnElongation"  
##  [81] "hg19_chromatin_Hepg2-TxnElongation"   
##  [82] "hg19_chromatin_Hmec-TxnElongation"    
##  [83] "hg19_chromatin_Hsmm-TxnElongation"    
##  [84] "hg19_chromatin_Huvec-TxnElongation"   
##  [85] "hg19_chromatin_K562-TxnElongation"    
##  [86] "hg19_chromatin_Nhek-TxnElongation"    
##  [87] "hg19_chromatin_Nhlf-TxnElongation"    
##  [88] "hg19_chromatin_Gm12878-WeakTxn"       
##  [89] "hg19_chromatin_H1hesc-WeakTxn"        
##  [90] "hg19_chromatin_Hepg2-WeakTxn"         
##  [91] "hg19_chromatin_Hmec-WeakTxn"          
##  [92] "hg19_chromatin_Hsmm-WeakTxn"          
##  [93] "hg19_chromatin_Huvec-WeakTxn"         
##  [94] "hg19_chromatin_K562-WeakTxn"          
##  [95] "hg19_chromatin_Nhek-WeakTxn"          
##  [96] "hg19_chromatin_Nhlf-WeakTxn"          
##  [97] "hg19_chromatin_Gm12878-Repressed"     
##  [98] "hg19_chromatin_H1hesc-Repressed"      
##  [99] "hg19_chromatin_Hepg2-Repressed"       
## [100] "hg19_chromatin_Hmec-Repressed"        
## [101] "hg19_chromatin_Hsmm-Repressed"        
## [102] "hg19_chromatin_Huvec-Repressed"       
## [103] "hg19_chromatin_K562-Repressed"        
## [104] "hg19_chromatin_Nhek-Repressed"        
## [105] "hg19_chromatin_Nhlf-Repressed"        
## [106] "hg19_chromatin_Gm12878-Heterochrom/lo"
## [107] "hg19_chromatin_H1hesc-Heterochrom/lo" 
## [108] "hg19_chromatin_Hepg2-Heterochrom/lo"  
## [109] "hg19_chromatin_Hmec-Heterochrom/lo"   
## [110] "hg19_chromatin_Hsmm-Heterochrom/lo"   
## [111] "hg19_chromatin_Huvec-Heterochrom/lo"  
## [112] "hg19_chromatin_K562-Heterochrom/lo"   
## [113] "hg19_chromatin_Nhek-Heterochrom/lo"   
## [114] "hg19_chromatin_Nhlf-Heterochrom/lo"   
## [115] "hg19_chromatin_Gm12878-Repetitive/CNV"
## [116] "hg19_chromatin_H1hesc-Repetitive/CNV" 
## [117] "hg19_chromatin_Hepg2-Repetitive/CNV"  
## [118] "hg19_chromatin_Hmec-Repetitive/CNV"   
## [119] "hg19_chromatin_Hsmm-Repetitive/CNV"   
## [120] "hg19_chromatin_Huvec-Repetitive/CNV"  
## [121] "hg19_chromatin_K562-Repetitive/CNV"   
## [122] "hg19_chromatin_Nhek-Repetitive/CNV"   
## [123] "hg19_chromatin_Nhlf-Repetitive/CNV"   
## [124] "hg19_enhancers_fantom"                
## [125] "hg19_lncrna_gencode"                  
## [126] "hg19_basicgenes"                      
## [127] "hg19_cpgs"                            
## [128] "hg19_Gm12878-chromatin"               
## [129] "hg19_H1hesc-chromatin"                
## [130] "hg19_Hepg2-chromatin"                 
## [131] "hg19_Hmec-chromatin"                  
## [132] "hg19_Hsmm-chromatin"                  
## [133] "hg19_Huvec-chromatin"                 
## [134] "hg19_K562-chromatin"                  
## [135] "hg19_Nhek-chromatin"                  
## [136] "hg19_Nhlf-chromatin"
# annotChoices <- c("hg19_basicgenes", "hg19_enhancers_fantom")
annotChoices <- c("hg19_enhancers_fantom")
annotations = build_annotations(genome = 'hg19', annotations = annotChoices)
## Building enhancers...
grChr <- GRanges(seqnames = paste0("chr",seqnames(gr)),
                 ranges = ranges(gr),
                 strand = strand(gr),
                 mcols = mcols(gr))
names(grChr) <- names(gr)
gr_annotated = annotate_regions(
    regions = grChr,
    annotations = annotations,
    ignore.strand = TRUE,
    quiet = FALSE)
## Annotating...
print(gr_annotated)
## GRanges object with 15873 ranges and 2 metadata columns:
##               seqnames            ranges strand |          mcols.gc
##                  <Rle>         <IRanges>  <Rle> |         <numeric>
##       Peak_47     chr1   1092809-1094351      * | 0.709656513285807
##       Peak_49     chr1   1136813-1137028      * | 0.777777777777778
##       Peak_50     chr1   1143875-1144088      * | 0.672897196261682
##       Peak_67     chr1   1368511-1369085      * | 0.671304347826087
##       Peak_90     chr1   1714571-1715055      * | 0.548453608247423
##           ...      ...               ...    ... .               ...
##   Peak_302611     chrY   2871866-2872091      * | 0.455752212389381
##   Peak_302661     chrY   7337008-7337286      * | 0.347670250896057
##   Peak_302668     chrY   7592241-7592813      * | 0.443280977312391
##   Peak_303051     chrY 17567809-17568320      * |        0.66796875
##   Peak_303212     chrY 59019918-59020154      * | 0.413502109704641
##                                annot
##                            <GRanges>
##       Peak_47   chr1:1093611-1093958
##       Peak_49   chr1:1136668-1137146
##       Peak_50   chr1:1143532-1144198
##       Peak_67   chr1:1368472-1368786
##       Peak_90   chr1:1714021-1714715
##           ...                    ...
##   Peak_302611   chrY:2872047-2872325
##   Peak_302661   chrY:7337078-7337293
##   Peak_302668   chrY:7592272-7592553
##   Peak_303051 chrY:17567836-17568213
##   Peak_303212 chrY:59019813-59020026
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths
df_gr_annotated = data.frame(gr_annotated)

mean(paste0("Peak_",gcfqPeaks) %in% names(gr_annotated))
## [1] 0.05164525
mean(paste0("Peak_",fqPeaks) %in% names(gr_annotated))
## [1] 0.05135724
mean(names(gr_annotated) %in% paste0("Peak_",gcfqPeaks))
## [1] 0.6272916
mean(names(gr_annotated) %in% paste0("Peak_",fqPeaks))
## [1] 0.6882127